import os
import numpy as np

#For importing keypoint RCNN pretrained model and image preprocessing
import torchvision
import torch

#For reading the image
import cv2

#For visualization
import matplotlib.pyplot as plt
from torchvision.models.detection import KeypointRCNN_ResNet50_FPN_Weights

# Build the pretrained Keypoint R-CNN model and put it in evaluation mode
# before running or exporting it.
model = torchvision.models.detection.keypointrcnn_resnet50_fpn(weights=KeypointRCNN_ResNet50_FPN_Weights.DEFAULT)
model.eval()

# Two random 3-channel images of different sizes serve as example inputs.
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]

# Sanity-check forward pass. No gradients are needed here, so autograd
# tracking is disabled to avoid recording a graph for the whole network.
with torch.no_grad():
    predictions = model(x)

# Export the model to ONNX. The resulting "keypoint_rcnn.onnx" file is the one
# loaded by the ONNX Runtime session further down in this script.
torch.onnx.export(model, x, "keypoint_rcnn.onnx", opset_version=11)

import onnxruntime as ort
import cv2 as cv
import numpy as np
import torchvision

# (category id, class name) pairs in ascending id order. Id 0 is the
# background entry; some ids (e.g. 12, 26, 29, 30) have no entry at all.
_COCO_ID_NAME_PAIRS = (
    (0, 'background'), (1, 'person'), (2, 'bicycle'), (3, 'car'),
    (4, 'motorcycle'), (5, 'airplane'), (6, 'bus'), (7, 'train'),
    (8, 'truck'), (9, 'boat'), (10, 'traffic light'), (11, 'fire hydrant'),
    (13, 'stop sign'), (14, 'parking meter'), (15, 'bench'), (16, 'bird'),
    (17, 'cat'), (18, 'dog'), (19, 'horse'), (20, 'sheep'),
    (21, 'cow'), (22, 'elephant'), (23, 'bear'), (24, 'zebra'),
    (25, 'giraffe'), (27, 'backpack'), (28, 'umbrella'), (31, 'handbag'),
    (32, 'tie'), (33, 'suitcase'), (34, 'frisbee'), (35, 'skis'),
    (36, 'snowboard'), (37, 'sports ball'), (38, 'kite'), (39, 'baseball bat'),
    (40, 'baseball glove'), (41, 'skateboard'), (42, 'surfboard'), (43, 'tennis racket'),
    (44, 'bottle'), (46, 'wine glass'), (47, 'cup'), (48, 'fork'),
    (49, 'knife'), (50, 'spoon'), (51, 'bowl'), (52, 'banana'),
    (53, 'apple'), (54, 'sandwich'), (55, 'orange'), (56, 'broccoli'),
    (57, 'carrot'), (58, 'hot dog'), (59, 'pizza'), (60, 'donut'),
    (61, 'cake'), (62, 'chair'), (63, 'couch'), (64, 'potted plant'),
    (65, 'bed'), (67, 'dining table'), (70, 'toilet'), (72, 'tv'),
    (73, 'laptop'), (74, 'mouse'), (75, 'remote'), (76, 'keyboard'),
    (77, 'cell phone'), (78, 'microwave'), (79, 'oven'), (80, 'toaster'),
    (81, 'sink'), (82, 'refrigerator'), (84, 'book'), (85, 'clock'),
    (86, 'vase'), (87, 'scissors'), (88, 'teddybear'), (89, 'hair drier'),
    (90, 'toothbrush'),
)

# Lookup table keyed by the id rendered as a string, because the drawing code
# looks labels up with ``str(label_id)``.
coco_names = {str(class_id): class_name for class_id, class_name in _COCO_ID_NAME_PAIRS}

# Preprocessing pipeline: ToTensor turns the loaded image array into the
# 3-D tensor that is unpacked as (c, h, w) below.
transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

sess_options = ort.SessionOptions()
# Performance tuning: number of threads used inside a single operator, and
# the most aggressive graph-optimization level.
sess_options.intra_op_num_threads = 24
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# Prefer CUDA, but list the CPU provider as an explicit fallback so the
# session can still be created when the CUDA provider is unavailable.
ort_session = ort.InferenceSession("keypoint_rcnn.onnx", sess_options=sess_options,
                                   providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])

image_path = "e:/test.jpg"
src = cv.imread(image_path)
# cv.imread reports failure by returning None instead of raising; fail here
# with a clear message rather than with an opaque error inside cvtColor.
if src is None:
    raise FileNotFoundError("Could not read image: " + image_path)

cv.namedWindow("KeyPointRCNN Detection Demo", cv.WINDOW_AUTOSIZE)
# OpenCV loads images in BGR channel order; convert to RGB before preprocessing.
image = cv.cvtColor(src, cv.COLOR_BGR2RGB)
blob = transform(image)
c, h, w = blob.shape
# First model input: the image with a leading batch axis of size 1.
input_x = blob.view(1, c, h, w)
# Second model input: a single image holds only c*h*w elements, so it cannot
# be *viewed* as a batch of two (Tensor.view raises RuntimeError when the
# element count changes). Duplicate the image along a new batch axis instead.
# NOTE(review): confirm the exported model's second input expects this shape.
input_y = blob.unsqueeze(0).repeat(2, 1, 1, 1)
def to_numpy(tensor):
    """Return the contents of ``tensor`` as a NumPy array on the CPU.

    A tensor that requires grad is detached from the autograd graph before
    it is moved to the CPU and converted.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

# Run the ONNX Runtime session, feeding the two prepared arrays to the
# model's first two declared inputs.
model_inputs = ort_session.get_inputs()
ort_inputs = {
    model_inputs[0].name: to_numpy(input_x),
    model_inputs[1].name: to_numpy(input_y),
}
ort_outs = ort_session.run(None, ort_inputs)

# The first four outputs are consumed, in order, as:
#   boxes            - (N, 4) array of absolute bounding-box coordinates
#   labels           - class id per detection
#   scores           - confidence per detection
#   multi_key_points - key points per detection
boxes, labels, scores, multi_key_points = ort_outs[0], ort_outs[1], ort_outs[2], ort_outs[3]

# Quick sanity report of the output shapes and dtypes.
print(boxes.shape, boxes.dtype, labels.shape, labels.dtype, scores.shape, scores.dtype, multi_key_points.shape)

# Pairs of key-point indices that are joined by a line to draw the skeleton.
# The order matches the original drawing order.
SKELETON_EDGES = (
    (0, 1), (1, 3),                 # nose -> left_eye -> left_ear
    (0, 2), (2, 4),                 # nose -> right_eye -> right_ear
    (0, 5), (5, 7), (7, 9),         # nose -> left_shoulder -> left_elbow -> left_wrist
    (0, 6), (6, 8), (8, 10),        # nose -> right_shoulder -> right_elbow -> right_wrist
    (5, 11), (11, 13), (13, 15),    # left_shoulder -> left_hip -> left_knee -> left_ankle
    (6, 12), (12, 14), (14, 16),    # right_shoulder -> right_hip -> right_knee -> right_ankle
)
# Detections whose score is not above this value are not drawn.
SCORE_THRESHOLD = 0.5

for index, (x1, y1, x2, y2) in enumerate(boxes):
    if not scores[index] > SCORE_THRESHOLD:
        continue

    # Bounding box, with plain Python ints for the OpenCV point arguments.
    top_left = (int(x1), int(y1))
    bottom_right = (int(x2), int(y2))
    cv.rectangle(src, top_left, bottom_right, (140, 199, 0), 2, 8, 0)

    # Class label; the id table has gaps, so fall back to the raw id text
    # instead of raising KeyError in the middle of drawing.
    label_id = labels[index]
    label_txt = coco_names.get(str(label_id), str(label_id))
    cv.putText(src, label_txt, top_left, cv.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 1)

    # Each key point carries three values; only the first two (x, y) are drawn.
    kpts = np.int32(multi_key_points[index])
    points = [(int(kx), int(ky)) for kx, ky, _ in kpts]

    # Skeleton segments first, then the joints on top of them.
    for start, end in SKELETON_EDGES:
        cv.line(src, points[start], points[end], (255, 255, 0), 2, 8, 0)
    for point in points:
        cv.circle(src, point, 3, (0, 0, 255), 2, 8, 0)

# Show the annotated image until a key is pressed, then close the window.
cv.imshow("KeyPointRCNN Detection Demo", src)
cv.waitKey(0)
cv.destroyAllWindows()